Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
There seem to be two different behaviours in mean expression by gene
A sort of heavy right tail in the samples’ mean expression, although the difference doesn’t seem to be that big (x axis scale)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
theme_minimal() + ggtitle('Mean expression density by gene (left) and by Sample (right)'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
The two groups of genes seem to be partially characterised by genes with Neuronal function. Maybe a more thorough analysis of functional annotations could help characterise this two groups better.
The heavy right tale from the sample distribution corresponds to a group of Autism samples. In general, the autism group has a bigger variance than the control group
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>%
left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
scale_x_log10() + theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression density by gene')
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis_, fill=Diagnosis_)) + geom_density(alpha=0.3) +
theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression density by Sample')
grid.arrange(p1, p2, nrow=1)
rm(GO_annotations, GO_neuronal, plot_data, p1, p2)
In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene.
plot_data = data.frame('ID'=rownames(datExpr),
'ASD'=rowMeans(datExpr[,datMeta$Diagnosis_=='ASD']),
'CTL'=rowMeans(datExpr[,datMeta$Diagnosis_!='ASD']))
plot_data %>% ggplot(aes(ASD,CTL)) + geom_point(alpha=0.1, color='#0099cc') +
geom_abline(color='gray') + ggtitle('Mean expression ASD vs CTL') + theme_minimal()
There doesn’t seem to be a noticeable difference between mean expression by gene between diagnosis groups
Samples with autism tend to have a wider dispersion of mean expression with higher values than the control group (as we had already seen before)
plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis_=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis_!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())
plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis_=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis_!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + theme_minimal() +
ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
The first principal component seems to separate almost perfectly the two diagnosis
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis_') %>%
mutate('Diagnosis_'=factor(Diagnosis_, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point() + theme_minimal() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Looks exactly the same as the PCA visualisation, just inverting the y-axis
d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('C1','C2','Diagnosis_') %>%
mutate('Diagnosis_'=factor(Diagnosis_, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(C1, C2, color=Diagnosis_)) + geom_point() + theme_minimal() + ggtitle('MDS')
rm(d, fit, plot_data)
Higher perplexities seem to capture the difference between diagnosis better, although at the end they seem to be capturing another pattern as well, since the samples seem to be grouped in pairs
perplexities = c(5,10,15,20)
ps= list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('C1','C2','Diagnosis_','Subject_ID') %>%
mutate('Diagnosis_'=factor(Diagnosis_, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis_)) + geom_point() + theme_minimal() +
ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(ps, perplexities, tsne)
The second pattern t-SNE seems to be capturing is the subject the samples belong to!
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() +
theme(legend.position='none') + ggtitle('t-SNE Perplexity=20 coloured by Subject ID'))
rm(plot_data)
First Principal Component explains almost 99% of the total variance
There’s a really strong correlation between the mean expression of a gene and the 1st principal component
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Distance matrix is too heavy to calculate and the resulting distance object is to big to even load (3.4GB)
t-SNE seems to group samples by their mean expression as PCA does, just in a less linear way
perplexities = c(1,2,5,10,50,100)
ps= list()
for(i in 1:length(perplexities)){
tsne = read.csv(paste0('./../Data/Gandal/Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() +
scale_color_viridis() + ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(ps, plot_data, perplexities, tsne, i)
3000 genes (~10%) are significant using a threshold of 0.05 for the adjusted p-value and a without a log Fold Change threshold (keeping the null hypothesis \(H_0: lfc=0\))
5838 genes don’t have an adjusted p-value because they have less mean normalized counts than the optimal threshold (https://support.bioconductor.org/p/76144/), so they are going to be considered not to be significant
table(DE_info$padj<0.05, useNA='ifany')
##
## FALSE TRUE <NA>
## 21271 3000 5838
DE_info$significant = DE_info$padj<0.05 & !is.na(DE_info$padj)
p = DE_info[complete.cases(DE_info),] %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) +
scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)
rm(p)
DE_info %>% ggplot(aes(baseMean, log2FoldChange, color=significant)) + geom_point(alpha=0.3) +
geom_smooth(aes(baseMean, log2FoldChange), method='lm', inherit.aes=FALSE, color='gray') +
scale_x_log10() + theme_minimal()
When filtering for differential expression, the points seem to separate ino two clouds depending on whether they are over or underexpressed
The top cloud corresponds to the over expressed genes and the bottom to the under expressed ones
datExpr_DE = datExpr[DE_info$significant,]
pca = datExpr_DE %>% prcomp
plot_data = cbind(data.frame('ID'=rownames(datExpr_DE), 'PC1'=pca$x[,1], 'PC2'=pca$x[,2]),
DE_info[DE_info$significant==TRUE,])
pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.02, pos_zero, pos_zero+0.02, 1)) +
theme_minimal() + ggtitle('PCA of significant genes') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)
rm(pos_zero, p)
Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed
intercept=-0.1
slope=0.018
plot_data = plot_data %>% mutate('Group'=ifelse(PC2>slope*PC1+intercept,'overexpressed','underexpressed')) %>%
mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))
plot_data %>% ggplot(aes(PC1, PC2, color=Group)) + geom_point(alpha=0.4) +
geom_abline(aes(intercept=intercept, slope=slope), color='gray') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA')
rm(pca)
Plotting the mean expression by group they seem to have two and three underlying distributions, respectively, so a Gaussian Mixture Model was fitted to each one to separate them into two/three Gaussians and then the points corresponding to each one were plotted in the original PCA plot.
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
tot_n_clusters = 5
plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))
GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(3)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
'Membership'=GMM_G1$Log_likelihood %>% apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
'Membership'=GMM_G2$Log_likelihood %>% apply(1, function(x) glue('under_', which.max(x))))
plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')
p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) +
theme_minimal() + theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(x=MeanExpr)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
args=list(mean=GMM_G1$centroids[3], sd=GMM_G1$covariance_matrices[3])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[5],
args=list(mean=GMM_G2$centroids[2], sd=GMM_G2$covariance_matrices[2])) +
theme_minimal()
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() +
theme(legend.position='bottom')
grid.arrange(p1, p2, p3, nrow=1)
rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2)
## Warning in rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, :
## object 'gg_color_hue' not found
## Warning in rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, :
## object 'n_clusters' not found
For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any distinguishible patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.
plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal()
rm(plot_data)
fc_list = seq(1, 1.2, 0.01)
n_genes = nrow(datExpr)
# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)
# Initialice DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=rownames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
pca_genes_old = pcas_genes
for(fc in fc_list){
# Recalculate DE_info with the new threshold (p-values change) an filter DE genes
DE_genes = results(dds, lfcThreshold=log2(fc), altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID'=rownames(.)) %>% filter(padj<0.05)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
pca_genes_new$PC1 = -pca_genes_new$PC1
}
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
pca_genes_new$PC2 = -pca_genes_new$PC2
}
pca_samps_old = pca_samps_new
pca_genes_old = pca_genes_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
pcas_genes = rbind(pcas_genes, pca_genes_new)
}
# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select(ID, PC1, PC2, fc, Diagnosis_, Brain_lobe)
# pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>%
# mutate('score'=as.factor(`gene-score`)) %>%
# dplyr::select(ID, PC1, PC2, lfc, score)
# Plot change of number of genes
ggplotly(data.frame('fc'=fc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=fc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() + xlab('Fold Change') +
ggtitle('Number of remaining genes when modifying filtering threshold'))
Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames
The higher the filtering threshold, the more concentrated the Control samples seem to get
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=fc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
There doesn’t seem to be any recognisable pattern
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=fc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
ggplotly(pcas_genes %>% ggplot(aes(PC1, PC2)) + geom_point(aes(frame=fc, ids=ID, alpha=0.3)) +
theme_minimal() + ggtitle('Genes PCA plot modifying filtering threshold'))